{
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {},
   "outputs": [],
   "source": [
    "# !wget https://huggingface.co/datasets/mesolitica/semisupervised-abstractive-summarization-ms-news/resolve/main/populate-news.json.semisupervised"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {},
   "outputs": [],
   "source": [
    "import os\n",
    "os.environ['CUDA_VISIBLE_DEVICES'] = ''"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {},
   "outputs": [],
   "source": [
    "import json\n",
    "import malaya\n",
    "from tqdm import tqdm\n",
    "from unidecode import unidecode\n",
    "import random"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {},
   "outputs": [],
   "source": [
    "import re\n",
    "\n",
    "def cleaning(string):\n",
    "    splitted = malaya.text.function.split_into_sentences(string)\n",
    "    if not len(splitted):\n",
    "        splitted = '. '.join([k.strip() for k in string.split('.') if len(k.strip())])\n",
    "    if splitted[0][0] == '-':\n",
    "        splitted[0] = splitted[0].replace('- ','')\n",
    "    points = [f'{no + 1}. {s}' for no, s in enumerate(splitted)]\n",
    "    points = ' '.join(points)\n",
    "    return points"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {},
   "outputs": [],
   "source": [
    "from sklearn.feature_extraction.text import TfidfVectorizer, CountVectorizer\n",
    "from malaya.text.vectorizer import SkipGramCountVectorizer, SkipGramTfidfVectorizer\n",
    "from sklearn.decomposition import TruncatedSVD, LatentDirichletAllocation\n",
    "\n",
    "stopwords = malaya.text.function.get_stopwords()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = SkipGramCountVectorizer(\n",
    "    max_df = 0.95,\n",
    "    min_df = 1,\n",
    "    ngram_range = (1, 3),\n",
    "    stop_words = stopwords,\n",
    "    skip = 2\n",
    ")\n",
    "svd = TruncatedSVD(n_components = 30)\n",
    "model = malaya.summarization.extractive.sklearn(svd, vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {},
   "outputs": [],
   "source": [
    "vectorizer = SkipGramTfidfVectorizer(\n",
    "    max_df = 0.95,\n",
    "    min_df = 1,\n",
    "    ngram_range = (1, 3),\n",
    "    stop_words = stopwords,\n",
    "    skip = 2\n",
    ")\n",
    "svd = TruncatedSVD(n_components = 30)\n",
    "model_tfidf = malaya.summarization.extractive.sklearn(svd, vectorizer)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 43,
   "metadata": {},
   "outputs": [],
   "source": [
    "months = {\n",
    "    'january',\n",
    "    'jan',\n",
    "    'januari',\n",
    "    'february',\n",
    "    'feb',\n",
    "    'februari',\n",
    "    'march',\n",
    "    'mac',\n",
    "    'april',\n",
    "    'apr',\n",
    "    'may',\n",
    "    'mei',\n",
    "    'june',\n",
    "    'jun',\n",
    "    'july',\n",
    "    'julai',\n",
    "    'august',\n",
    "    'ogos',\n",
    "    'aug',\n",
    "    'september',\n",
    "    'sep',\n",
    "    'october',\n",
    "    'oktober',\n",
    "    'oct',\n",
    "    'november',\n",
    "    'nov',\n",
    "    'december',\n",
    "    'disember',\n",
    "    'dec',\n",
    "    'utusan',\n",
    "    'malaysiakini',\n",
    "    'astroawani',\n",
    "    'bernama',\n",
    "    'com',\n",
    "}"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 48,
   "metadata": {},
   "outputs": [],
   "source": [
    "def simple_cleaning(string):\n",
    "    return re.sub(r'[ ]+', ' ', unidecode(string).replace('\\n', ' ').replace('--', ' ').replace('/', ' ')).strip()"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 55,
   "metadata": {},
   "outputs": [
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "81717it [08:35, 158.52it/s]\n"
     ]
    }
   ],
   "source": [
    "before, after = [], []\n",
    "\n",
    "count = 0\n",
    "rejected = []\n",
    "languages = []\n",
    "accepted = []\n",
    "para = []\n",
    "malaysian_news = {\n",
    "    'kosmo',\n",
    "    'hmetro',\n",
    "    'malaymail',\n",
    "    'projekmm',\n",
    "    'bharian',\n",
    "    'utusan',\n",
    "    'astroawani',\n",
    "    'themalaysianinsight',\n",
    "    'malaysiakini',\n",
    "    'bernama'\n",
    "}\n",
    "\n",
    "def reject(data):\n",
    "    if data['news'] in malaysian_news:\n",
    "        return False\n",
    "    if any([n in data['top-image'] for n in malaysian_news]):\n",
    "        return False\n",
    "    if any([n in data['url'] for n in malaysian_news]):\n",
    "        return False\n",
    "    if 'com.my' in data['top-image']:\n",
    "        return False\n",
    "    if data['language'] == 'malay':\n",
    "        return False\n",
    "    if 'Siaran Pers' in data['news']:\n",
    "        return True\n",
    "    if '.id' in data['news']:\n",
    "        return True\n",
    "    \n",
    "    return True\n",
    "\n",
    "with open('populate-news.json.semisupervised') as fopen:\n",
    "    for l in tqdm(fopen):\n",
    "        data = json.loads(l)\n",
    "        text = re.sub(r'[ ]+', ' ', data['text']).strip()\n",
    "        if 'kindly register' in text.lower() or 'disabled in your browser' in text.lower():\n",
    "            continue\n",
    "        if len(text.split()) < 30:\n",
    "            continue\n",
    "            \n",
    "        # accepted.append(data)\n",
    "        summaries = [malaya.text.function.remove_empty_parenthesis(s) for s in data['semisupervised-summaries']]\n",
    "        for s in summaries:\n",
    "            if len(s.split()) > 20:\n",
    "                accepted.append((s, data['text']))\n",
    "                \n",
    "        keywords_rake = malaya.keyword.extractive.rake(data['text'], \n",
    "                                                  top_k = random.randint(25, 50))\n",
    "        keywords_rake = [simple_cleaning(k[1]) for k in keywords_rake if len(k[1].split()) >= 3 and len(k[1]) > 20 \\\n",
    "                        and len(set(k[1].lower().replace('-', '').split()) & months) == 0]\n",
    "        \n",
    "        already = set()\n",
    "        filtered = []\n",
    "        for k in keywords_rake:\n",
    "            if k.lower() in already:\n",
    "                continue\n",
    "            else:\n",
    "                already.add(k.lower())\n",
    "                filtered.append(k)\n",
    "                \n",
    "        keywords_rake = filtered\n",
    "        \n",
    "        if len(keywords_rake) > 5:\n",
    "            # print(keywords_rake)\n",
    "            accepted.append(('. '.join(keywords_rake), data['text']))\n",
    "            \n",
    "#         if random.random() > 0.1:\n",
    "#             continue\n",
    "        \n",
    "#         try:\n",
    "#             extractive = model_tfidf.sentence_level(data['text'], top_k = random.randint(3, 5))['summary']\n",
    "#             splitted = malaya.text.function.split_into_sentences(data['text'])\n",
    "#             if len(malaya.text.function.split_into_sentences(extractive)) <= (len(splitted) / 2):\n",
    "#                 accepted.append((extractive, data['text']))\n",
    "#         except Exception as e:\n",
    "#             print(e)\n",
    "            \n",
    "        count += 1"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 57,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "219612"
      ]
     },
     "execution_count": 57,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "len(accepted)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 58,
   "metadata": {},
   "outputs": [],
   "source": [
    "with open('news.json', 'w') as fopen:\n",
    "    json.dump(accepted, fopen)"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 59,
   "metadata": {},
   "outputs": [
    {
     "data": {
      "text/plain": [
       "('wilayah Okayama mencipta pelbagai haiwan. Jepun mencipta hasil seni menarik. \"Gambar kucing menangkap ikan. menghadiri pelbagai acara. menerima tawaran kerja. popular seekor kucing',\n",
       " 'Seorang barista di Jepun mencipta hasil seni menarik dan comel hanya menggunakan buih susu dalam latte dihidangkannya.\\n\\nKazuki Yamamoto, 29, dari wilayah Okayama mencipta pelbagai haiwan kecil yang nampak realistik ketika bekerja di sebuah restoran, sebelum hasil seninya itu mula mendapat perhatian di seluruh dunia.\\n\\nKerja tangannya yang paling popular seekor kucing yang cuba masuk dari satu cawan ke cawan lain untuk menangkap ikan.\\n\\nDia yang kini pengurus sebuah kafe di Harajyuku juga membuat buih berbentuk arnab dan kucing yang lain, dianggap pelanggannya sebagai ‘terlalu comel sehingga tidak tergamak untuk meminumnya.’\\n\\n“Gambar kucing menangkap ikan menjadi popular selepas tersebar di Internet sehingga saya mendapat tawaran untuk menghadiri pelbagai acara dan bekerja di luar negara.\\n\\n“Selepas itulah saya menerima tawaran kerja di kafe ini sekarang. Paling penting untuk rekaan saya adalah menggunakan krim yang lembut dan susu berkualiti tinggi.\\n\\n“Saya tidak menafikan teknik ini boleh ditiru, namun tiada sesiapa mampu meniru idea saya yang tersendiri ini. Bagi saya seni latte 3D ini mungkin mudah dimusnahkan, tetapi melihatnya menggembirakan pelanggan saya,” katanya.')"
      ]
     },
     "execution_count": 59,
     "metadata": {},
     "output_type": "execute_result"
    }
   ],
   "source": [
    "accepted[-1]"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {},
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 3 (ipykernel)",
   "language": "python",
   "name": "python3"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 3
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython3",
   "version": "3.8.10"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 4
}
